% VideoBERT -- arXiv preprint 1904.01766 (cs.CV).
% The model name is braced so sentence-casing styles keep "VideoBERT" intact.
@misc{videobert,
  author        = {Sun, Chen and Myers, Austin and Vondrick, Carl and
                   Murphy, Kevin and Schmid, Cordelia},
  title         = {{VideoBERT}: A Joint Model for Video and Language Representation Learning},
  year          = {2019},
  eprint        = {1904.01766},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
}

% VisualBERT -- arXiv preprint 1908.03557 (cs.CV).
% The model name is braced so sentence-casing styles keep "VisualBERT" intact.
@misc{visualbert,
  author        = {Li, Liunian Harold and Yatskar, Mark and Yin, Da and
                   Hsieh, Cho-Jui and Chang, Kai-Wei},
  title         = {{VisualBERT}: A Simple and Performant Baseline for Vision and Language},
  year          = {2019},
  eprint        = {1908.03557},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
}

% VL-BERT -- arXiv preprint 1908.08530 (cs.CV).
% The model name is braced so sentence-casing styles do not render it as "Vl-bert".
@misc{vlbert,
  author        = {Su, Weijie and Zhu, Xizhou and Cao, Yue and Li, Bin and
                   Lu, Lewei and Wei, Furu and Dai, Jifeng},
  title         = {{VL-BERT}: Pre-training of Generic Visual-Linguistic Representations},
  year          = {2020},
  eprint        = {1908.08530},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
}

% Unicoder-VL -- arXiv preprint 1908.06066 (cs.CV).
% The model name is braced so sentence-casing styles do not lowercase the "VL" suffix.
@misc{unicodervl,
  author        = {Li, Gen and Duan, Nan and Fang, Yuejian and Gong, Ming and
                   Jiang, Daxin and Zhou, Ming},
  title         = {{Unicoder-VL}: A Universal Encoder for Vision and Language by Cross-modal Pre-training},
  year          = {2019},
  eprint        = {1908.06066},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
}

% UNITER -- arXiv preprint 1909.11740 (cs.CV).
% Braces preserve the acronym and the deliberately mixed-case words that spell it out.
% "El Kholy, Ahmed" uses the comma form so the two-word surname is not split
% (the "First Last" form would parse the surname as just "Kholy").
% NOTE(review): assumes "El Kholy" is the full family name -- confirm against the paper.
@misc{uniter,
  author        = {Chen, Yen-Chun and Li, Linjie and Yu, Licheng and
                   El Kholy, Ahmed and Ahmed, Faisal and Gan, Zhe and
                   Cheng, Yu and Liu, Jingjing},
  title         = {{UNITER}: {UNiversal} {Image-TExt} Representation Learning},
  year          = {2020},
  eprint        = {1909.11740},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
}

% Oscar -- arXiv preprint 2004.06165 (cs.CV).
@misc{oscar,
  author        = {Li, Xiujun and Yin, Xi and Li, Chunyuan and Zhang, Pengchuan and
                   Hu, Xiaowei and Zhang, Lei and Wang, Lijuan and Hu, Houdong and
                   Dong, Li and Wei, Furu and Choi, Yejin and Gao, Jianfeng},
  title         = {Oscar: Object-Semantics Aligned Pre-training for Vision-Language Tasks},
  year          = {2020},
  eprint        = {2004.06165},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
}

% VinVL -- arXiv preprint 2101.00529 (cs.CV).
% The model name is braced so sentence-casing styles do not render it as "Vinvl".
@misc{vinvl,
  author        = {Zhang, Pengchuan and Li, Xiujun and Hu, Xiaowei and
                   Yang, Jianwei and Zhang, Lei and Wang, Lijuan and
                   Choi, Yejin and Gao, Jianfeng},
  title         = {{VinVL}: Revisiting Visual Representations in Vision-Language Models},
  year          = {2021},
  eprint        = {2101.00529},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
}

% Large-scale adversarial training for vision-and-language -- arXiv preprint 2006.06195 (cs.CV).
@misc{villa,
  author        = {Gan, Zhe and Chen, Yen-Chun and Li, Linjie and Zhu, Chen and
                   Cheng, Yu and Liu, Jingjing},
  title         = {Large-Scale Adversarial Training for Vision-and-Language Representation Learning},
  year          = {2020},
  eprint        = {2006.06195},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
}

% M6 -- arXiv preprint 2103.00823 (cs.CL).
% "Chinese" is a proper adjective and "M6" a model name; both are braced so
% sentence-casing styles cannot lowercase them.
@misc{m6,
  author        = {Lin, Junyang and Men, Rui and Yang, An and Zhou, Chang and
                   Ding, Ming and Zhang, Yichang and Wang, Peng and Wang, Ang and
                   Jiang, Le and Jia, Xianyan and Zhang, Jie and Zhang, Jianwei and
                   Zou, Xu and Li, Zhikang and Deng, Xiaodong and Liu, Jie and
                   Xue, Jinbao and Zhou, Huiling and Ma, Jianxin and Yu, Jin and
                   Li, Yong and Lin, Wei and Zhou, Jingren and Tang, Jie and
                   Yang, Hongxia},
  title         = {{M6}: A {Chinese} Multimodal Pretrainer},
  year          = {2021},
  eprint        = {2103.00823},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CL},
}

% M3P -- arXiv preprint 2006.02635 (cs.CL).
% The model name is braced so sentence-casing styles do not render it as "M3p".
@misc{m3p,
  author        = {Ni, Minheng and Huang, Haoyang and Su, Lin and Cui, Edward and
                   Bharti, Taroon and Wang, Lijuan and Gao, Jianfeng and
                   Zhang, Dongdong and Duan, Nan},
  title         = {{M3P}: Learning Universal Representations via Multitask Multilingual Multimodal Pre-training},
  year          = {2021},
  eprint        = {2006.02635},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CL},
}

% UC2 -- arXiv preprint 2104.00332 (cs.CV).
% The model name is braced so sentence-casing styles do not render it as "Uc2".
@misc{uc2,
  author        = {Zhou, Mingyang and Zhou, Luowei and Wang, Shuohang and
                   Cheng, Yu and Li, Linjie and Yu, Zhou and Liu, Jingjing},
  title         = {{UC2}: Universal Cross-lingual Cross-modal Vision-and-Language Pre-training},
  year          = {2021},
  eprint        = {2104.00332},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
}

% "Seeing Out of tHe bOx" -- arXiv preprint 2104.03135 (cs.CV).
% The title uses deliberate mixed casing ("Out", "tHe", "bOx"); each such word is
% braced so sentence-casing styles cannot normalise it.
@misc{soho,
  author        = {Huang, Zhicheng and Zeng, Zhaoyang and Huang, Yupan and
                   Liu, Bei and Fu, Dongmei and Fu, Jianlong},
  title         = {Seeing {Out} of {tHe} {bOx}: End-to-End Pre-training for Vision-Language Representation Learning},
  year          = {2021},
  eprint        = {2104.03135},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
}

% ViLT -- arXiv preprint 2102.03334 (stat.ML).
% The model name is braced so sentence-casing styles do not render it as "Vilt".
@misc{vilt,
  author        = {Kim, Wonjae and Son, Bokyung and Kim, Ildoo},
  title         = {{ViLT}: Vision-and-Language Transformer Without Convolution or Region Supervision},
  year          = {2021},
  eprint        = {2102.03334},
  archivePrefix = {arXiv},
  primaryClass  = {stat.ML},
}

% ViLBERT -- arXiv preprint 1908.02265 (cs.CV).
% The model name is braced so sentence-casing styles do not render it as "Vilbert".
@misc{vilbert,
  author        = {Lu, Jiasen and Batra, Dhruv and Parikh, Devi and Lee, Stefan},
  title         = {{ViLBERT}: Pretraining Task-Agnostic Visiolinguistic Representations for Vision-and-Language Tasks},
  year          = {2019},
  eprint        = {1908.02265},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
}

% LXMERT -- arXiv preprint 1908.07490 (cs.CL).
% The acronym is braced so sentence-casing styles do not render it as "Lxmert".
@misc{lxmert,
  author        = {Tan, Hao and Bansal, Mohit},
  title         = {{LXMERT}: Learning Cross-Modality Encoder Representations from Transformers},
  year          = {2019},
  eprint        = {1908.07490},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CL},
}

% Contrastive Bidirectional Transformer for video representations -- arXiv preprint 1906.05743 (cs.LG).
@misc{cbt,
  author        = {Sun, Chen and Baradel, Fabien and Murphy, Kevin and Schmid, Cordelia},
  title         = {Learning Video Representations using Contrastive Bidirectional Transformer},
  year          = {2019},
  eprint        = {1906.05743},
  archivePrefix = {arXiv},
  primaryClass  = {cs.LG},
}

% Transferable visual models from natural language supervision -- arXiv preprint 2103.00020 (cs.CV).
@misc{clip,
  author        = {Radford, Alec and Kim, Jong Wook and Hallacy, Chris and
                   Ramesh, Aditya and Goh, Gabriel and Agarwal, Sandhini and
                   Sastry, Girish and Askell, Amanda and Mishkin, Pamela and
                   Clark, Jack and Krueger, Gretchen and Sutskever, Ilya},
  title         = {Learning Transferable Visual Models From Natural Language Supervision},
  year          = {2021},
  eprint        = {2103.00020},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
}

% 12-in-1 multi-task vision and language learning -- arXiv preprint 1912.02315 (cs.CV).
@misc{12in1,
  author        = {Lu, Jiasen and Goswami, Vedanuj and Rohrbach, Marcus and
                   Parikh, Devi and Lee, Stefan},
  title         = {12-in-1: Multi-Task Vision and Language Representation Learning},
  year          = {2020},
  eprint        = {1912.02315},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
}

% UniT -- arXiv preprint 2102.10772 (cs.CV).
% The model name is braced so sentence-casing styles do not render it as "Unit".
@misc{unit,
  author        = {Hu, Ronghang and Singh, Amanpreet},
  title         = {{UniT}: Multimodal Multitask Learning with a Unified Transformer},
  year          = {2021},
  eprint        = {2102.10772},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
}

% WenLan -- arXiv preprint 2103.06561 (cs.CV).
% The model name is braced so sentence-casing styles do not render it as "Wenlan".
@misc{wenlan,
  author        = {Huo, Yuqi and Zhang, Manli and Liu, Guangzhen and Lu, Haoyu and
                   Gao, Yizhao and Yang, Guoxing and Wen, Jingyuan and Zhang, Heng and
                   Xu, Baogui and Zheng, Weihao and Xi, Zongzheng and Yang, Yueqian and
                   Hu, Anwen and Zhao, Jinming and Li, Ruichen and Zhao, Yida and
                   Zhang, Liang and Song, Yuqing and Hong, Xin and Cui, Wanqing and
                   Hou, Danyang and Li, Yingyan and Li, Junyi and Liu, Peiyu and
                   Gong, Zheng and Jin, Chuhao and Sun, Yuchong and Chen, Shizhe and
                   Lu, Zhiwu and Dou, Zhicheng and Jin, Qin and Lan, Yanyan and
                   Zhao, Wayne Xin and Song, Ruihua and Wen, Ji-Rong},
  title         = {{WenLan}: Bridging Vision and Language by Large-Scale Multi-Modal Pre-Training},
  year          = {2021},
  eprint        = {2103.06561},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
}

% Scaling up representation learning with noisy text supervision -- arXiv preprint 2102.05918 (cs.CV).
@misc{align,
  author        = {Jia, Chao and Yang, Yinfei and Xia, Ye and Chen, Yi-Ting and
                   Parekh, Zarana and Pham, Hieu and Le, Quoc V. and
                   Sung, Yunhsuan and Li, Zhen and Duerig, Tom},
  title         = {Scaling Up Visual and Vision-Language Representation Learning With Noisy Text Supervision},
  year          = {2021},
  eprint        = {2102.05918},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
}

% COOKIE -- conference paper, pages 2208--2217 of the proceedings named in booktitle.
% The acronym in the title and "IEEE/CVF" in the booktitle are braced so that
% styles which recase these fields keep them in capitals.
@inproceedings{cookie,
  author    = {Wen, Keyu and Xia, Jin and Huang, Yuanyuan and Li, Linyang and Xu, Jiayan and Shao, Jie},
  title     = {{COOKIE}: Contrastive Cross-Modal Knowledge Sharing Pre-Training for Vision-Language Representation},
  booktitle = {Proceedings of the {IEEE/CVF} International Conference on Computer Vision},
  pages     = {2208--2217},
  year      = {2021},
}

% ZeroVL -- arXiv preprint 2112.09331 (cs.CV).
% The model name is braced so sentence-casing styles do not render it as "Zerovl".
@misc{zerovl,
  author        = {Cui, Quan and Zhou, Boyan and Guo, Yu and Yin, Weidong and
                   Wu, Hao and Yoshie, Osamu},
  title         = {{ZeroVL}: A Strong Baseline for Aligning Vision-Language Representations with Limited Resources},
  year          = {2021},
  eprint        = {2112.09331},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
}

% ClipBERT -- arXiv preprint 2102.06183 (cs.CV).
% The model name appears mid-title, so it is braced to survive sentence-casing
% (it would otherwise be rendered as "clipbert").
@misc{clipbert,
  author        = {Lei, Jie and Li, Linjie and Zhou, Luowei and Gan, Zhe and
                   Berg, Tamara L. and Bansal, Mohit and Liu, Jingjing},
  title         = {Less is More: {ClipBERT} for Video-and-Language Learning via Sparse Sampling},
  year          = {2021},
  eprint        = {2102.06183},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
}